library(ggplot2)
library(heatmaply)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Loading required package: viridis
## Loading required package: viridisLite
## 
## ======================
## Welcome to heatmaply version 0.16.0
## 
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
## 
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## Or contact: <tal.galili@gmail.com>
## ======================
library(Rtsne)
# Load the GDSC gene-expression table; the CSV column "X" supplies the row names.
# stringsAsFactors = TRUE keeps text columns as factors, so summary() on them
# reports per-level counts on R >= 4.0 as well (where the default is FALSE).
data <- read.csv("geneExpression_GDSC.csv", row.names = "X",
                 stringsAsFactors = TRUE)
data
summary(data$GeneralType)
##    aero_dig_tract              bone            breast  digestive_system 
##                77                35                49                48 
##            kidney   large_intestine          leukemia              lung 
##                33                45                76                22 
##        lung_NSCLC         lung_SCLC          lymphoma           myeloma 
##               108                55                65                17 
##    nervous_system     neuroblastoma          pancreas              skin 
##                55                31                30                53 
##       soft_tissue           thyroid urogenital_system              NA's 
##                19                16               100                 1
# Summarise the finer-grained Type column of the table.
summary(data[["Type"]])
##            acute_myeloid_leukaemia                      adrenal_gland 
##                                 24                                  1 
##     anaplastic_large_cell_lymphoma                    B_cell_leukemia 
##                                  3                                 12 
##                    B_cell_lymphoma                      biliary_tract 
##                                 31                                  5 
##                            Bladder                         bone_other 
##                                 19                                  2 
##                             breast                   Burkitt_lymphoma 
##                                 49                                 13 
##                             cervix                     chondrosarcoma 
##                                 14                                  3 
##          chronic_myeloid_leukaemia             digestive_system_other 
##                                 10                                  1 
##                        endometrium                     ewings_sarcoma 
##                                 11                                 21 
##                       fibrosarcoma                             glioma 
##                                  2                                 51 
##      haematopoietic_neoplasm other               hairy_cell_leukaemia 
##                                  6                                  3 
##                      head and neck                   Hodgkin_lymphoma 
##                                 42                                  9 
##                             kidney                    large_intestine 
##                                 32                                 45 
##                           leukemia                              liver 
##                                  3                                 14 
##          lung_NSCLC_adenocarcinoma               lung_NSCLC_carcinoid 
##                                 65                                  4 
##              lung_NSCLC_large cell           lung_NSCLC_not specified 
##                                 13                                 11 
## lung_NSCLC_squamous_cell_carcinoma                         Lung_other 
##                                 15                                  1 
##          lung_small_cell_carcinoma             lymphoblastic_leukemia 
##                                 55                                 11 
##     lymphoblastic_T_cell_leukaemia            lymphoid_neoplasm other 
##                                  8                                 10 
##                    medulloblastoma                           melanoma 
##                                  4                                 50 
##                       mesothelioma                            myeloma 
##                                 21                                 12 
##                      neuroblastoma                         oesophagus 
##                                 31                                 35 
##                       osteosarcoma                              ovary 
##                                  9                                 41 
##                           pancreas                           prostate 
##                                 30                                  7 
##                   rhabdomyosarcoma                         skin_other 
##                                  9                                  3 
##                  soft_tissue_other                            stomach 
##                                  8                                 28 
##                    T_cell_leukemia                             testis 
##                                  3                                  1 
##                            thyroid            urogenital_system_other 
##                                 16                                  4 
##                             uterus                               NA's 
##                                  3                                  1

Visualize your data with ggplot

You can find a ggplot2 cheat sheet here ;) https://rstudio.com/resources/cheatsheets/

Let’s create a bar chart showing the number of cell lines per tissue type.

# Bar chart: one bar per GeneralType level (geom_bar counts the rows), filled
# by that same level; the theme() call rotates the x-axis labels by 90 degrees.
ggplot(data = data, mapping = aes(x = GeneralType, fill = GeneralType)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1))

# what happens if you remove theme?

Let’s try another plot that takes two variables.

# Box plot of the JUN column for each GeneralType level, filled by level.
# Optional: insert `geom_point() +` after geom_boxplot() to overlay the points.
ggplot(
  data = data,
  mapping = aes(x = GeneralType, y = JUN, fill = GeneralType)
) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1))

# You can also try other genes: replace JUN with another gene column.

Clustering analysis

set.seed(1000) # fix the RNG so the random gene sample is reproducible

# Rows to show: entries whose GeneralType is breast or lung.
filter_cell <- data$GeneralType %in% c("breast", "lung")

# Columns to show: the two annotation columns plus 50 randomly chosen others.
# Sample only from the non-annotation columns; drawing from all of
# colnames(data) could pick "Type" or "GeneralType" again and duplicate a
# column in the heatmap input.
annotation_cols <- c("Type", "GeneralType")
gene_cols <- setdiff(colnames(data), annotation_cols)
filter_gene <- c(annotation_cols, sample(gene_cols, 50))

heatmaply(data[filter_cell, filter_gene], column_text_angle = 90,
          hclust_method = "average") %>% layout(width = 500, height = 500)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
## Warning: 'heatmap' objects don't have these attributes: 'showlegend'
## Valid attributes include:
## 'type', 'visible', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'z', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'hovertext', 'transpose', 'xtype', 'ytype', 'zsmooth', 'connectgaps', 'xgap', 'ygap', 'zhoverformat', 'hovertemplate', 'zauto', 'zmin', 'zmax', 'zmid', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'coloraxis', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'zsrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'hovertemplatesrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'

dimensionality reduction (PCA)

# Keep only the breast, lung and pancreas entries.
filter_cell <- data$GeneralType %in% c("breast", "lung", "pancreas")

# PCA on columns 3 onward (per the original note, these are the numeric columns).
PCA <- prcomp(data[filter_cell, seq(from = 3, to = ncol(data))])

# Squared standard deviations of the first 10 components, i.e. the variance
# captured by each of them.
barplot((PCA$sdev^2)[1:10], las = 2)

# Scores of the first 10 rows on the first 5 principal components.
PCA$x[seq_len(10), seq_len(5)]
##                 PC1       PC2        PC3         PC4        PC5
## AU565    -13.724367 -1.871347 -5.3455675   1.9187103 -2.8918538
## BT-20     -5.445793 -5.249889  0.9582700  -1.8191926 -2.2218144
## BT-474   -15.100586  1.672376 -4.5099641  -1.5250846  2.0464121
## BT-483   -17.661129  2.925127  0.4103088   0.5627541  0.5346582
## BT-549     7.192831 12.845869  1.8666512  -3.2586385 -3.1331359
## CAL-120    7.303171  6.230816  2.3337983   1.5738670  4.7528825
## CAL-148  -16.637455  5.462866 -0.1385591   3.0094828  1.4583948
## CAL-51    -0.162678  3.928606  6.1819198   0.4521046  0.6904870
## CAL-85-1  10.393089 -6.561907 -1.3468198 -10.0114684 -4.4460762
## CAMA-1   -17.180801  2.735580 -5.9790370  -0.6497663  1.3056740
# Turn the PCA score matrix into a data frame and attach the tissue label of
# each retained row so the points can be coloured by it.
df <- as.data.frame(PCA$x)
df[["Type"]] <- data[["GeneralType"]][filter_cell]

# Scatter plot of the first two principal components, coloured by Type.
ggplot(data = df, mapping = aes(x = PC1, y = PC2, col = Type)) +
  geom_point()

clustering with the compressed data

# Heatmap of the first two principal components plus the Type label, clustered
# with hclust_method = "average" and sized to 500 x 500 through layout().
layout(
  heatmaply(df[c("PC1", "PC2", "Type")],
            hclust_method = "average", column_text_angle = 90),
  width = 500, height = 500
)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
## Warning: 'heatmap' objects don't have these attributes: 'showlegend'
## Valid attributes include:
## 'type', 'visible', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'z', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'hovertext', 'transpose', 'xtype', 'ytype', 'zsmooth', 'connectgaps', 'xgap', 'ygap', 'zhoverformat', 'hovertemplate', 'zauto', 'zmin', 'zmax', 'zmid', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'coloraxis', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'zsrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'hovertemplatesrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'

Another dimensionality reduction method: t-SNE

Unlike PCA, t-SNE focuses on local rather than global associations: only each point's closest neighbors are considered. The perplexity parameter controls the number of closest neighbors taken into account.

Also, t-SNE is stochastic: unless the random seed is fixed, each run gives a somewhat different result, whereas PCA is deterministic. However, the algorithm often gives a visually pleasing outcome.

# Keep only the breast, lung and pancreas entries.
filter_cell <- data$GeneralType %in% c("breast", "lung", "pancreas")

# t-SNE is stochastic, so fix the seed right before the call; otherwise the
# embedding depends on whatever consumed the RNG earlier in the session.
set.seed(1000)
tsne <- Rtsne(data[filter_cell, 3:ncol(data)], # selecting all numeric columns
              dims = 2, perplexity = 30, max_iter = 5000)

# The embedding is stored in tsne$Y; its two columns are referred to as V1 and
# V2 after conversion to a data frame. Attach the tissue label for colouring.
df <- as.data.frame(tsne$Y)
df$Type <- data$GeneralType[filter_cell]

ggplot(df, aes(x = V1, y = V2, col = Type)) + geom_point()

clustering with the compressed data

# Heatmap of the whole df data frame, clustered with hclust_method = "average"
# and sized to 500 x 500 through layout().
layout(
  heatmaply(df, hclust_method = "average", column_text_angle = 90),
  width = 500, height = 500
)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
## Warning: 'heatmap' objects don't have these attributes: 'showlegend'
## Valid attributes include:
## 'type', 'visible', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'z', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'hovertext', 'transpose', 'xtype', 'ytype', 'zsmooth', 'connectgaps', 'xgap', 'ygap', 'zhoverformat', 'hovertemplate', 'zauto', 'zmin', 'zmax', 'zmid', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'coloraxis', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'zsrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'hovertemplatesrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'